# Copyright (c) 2019 Matthew Help, Mathieu Tanneau and contributors              #src
#                                                                                #src
# Permission is hereby granted, free of charge, to any person obtaining a copy   #src
# of this software and associated documentation files (the "Software"), to deal  #src
# in the Software without restriction, including without limitation the rights   #src
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell      #src
# copies of the Software, and to permit persons to whom the Software is          #src
# furnished to do so, subject to the following conditions:                       #src
#                                                                                #src
# The above copyright notice and this permission notice shall be included in all #src
# copies or substantial portions of the Software.                                #src
#                                                                                #src
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR     #src
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,       #src
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE    #src
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER         #src
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,  #src
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE  #src
# SOFTWARE.                                                                      #src

# # Geographical clustering

# **This tutorial was originally contributed by Matthew Helm and Mathieu Tanneau.**

# The goal of this exercise is to cluster $n$ cities into $k$ groups, minimizing
# the total pairwise distance between cities *and* ensuring that the variance in
# the total populations of each group is relatively small.

# This tutorial uses the following packages:
using JuMP
import DataFrames
import HiGHS
import LinearAlgebra

# For this example, we'll use the 20 most populous cities in the United States.
cities = DataFrames.DataFrame(
    Union{String,Float64}[
        "New York, NY" 8.405 40.7127 -74.0059
        "Los Angeles, CA" 3.884 34.0522 -118.2436
        "Chicago, IL" 2.718 41.8781 -87.6297
        "Houston, TX" 2.195 29.7604 -95.3698
        "Philadelphia, PA" 1.553 39.9525 -75.1652
        "Phoenix, AZ" 1.513 33.4483 -112.0740
        "San Antonio, TX" 1.409 29.4241 -98.4936
        "San Diego, CA" 1.355 32.7157 -117.1610
        "Dallas, TX" 1.257 32.7766 -96.7969
        "San Jose, CA" 0.998 37.3382 -121.8863
        "Austin, TX" 0.885 30.2671 -97.7430
        "Indianapolis, IN" 0.843 39.7684 -86.1580
        "Jacksonville, FL" 0.842 30.3321 -81.6556
        "San Francisco, CA" 0.837 37.7749 -122.4194
        "Columbus, OH" 0.822 39.9611 -82.9987
        "Charlotte, NC" 0.792 35.2270 -80.8431
        "Fort Worth, TX" 0.792 32.7554 -97.3307
        "Detroit, MI" 0.688 42.3314 -83.0457
        "El Paso, TX" 0.674 31.7775 -106.4424
        "Memphis, TN" 0.653 35.1495 -90.0489
    ],
    ["city", "population", "lat", "lon"],
)

# ### Model Specifics

# We will cluster these 20 cities into 3 different groups and we will assume
# that the ideal or target population $P$ for a group is simply the total
# population of the 20 cities divided by 3:

n = size(cities, 1)
k = 3
P = sum(cities.population) / k

# ### Obtaining the distances between each city

# Let's compute the pairwise Haversine distance between each of the cities in
# our data set and store the result in a variable we'll call `dm`:

"""
    haversine(lat1, long1, lat2, long2, r = 6372.8)

Compute the haversine distance between two points on a sphere of radius `r`,
where the points are given by the latitude/longitude pairs `lat1/long1` and
`lat2/long2` (in degrees).
"""
function haversine(lat1, long1, lat2, long2, r = 6372.8)
    lat1, long1 = deg2rad(lat1), deg2rad(long1)
    lat2, long2 = deg2rad(lat2), deg2rad(long2)
    hav(a, b) = sin((b - a) / 2)^2
    inner_term = hav(lat1, lat2) + cos(lat1) * cos(lat2) * hav(long1, long2)
    d = 2 * r * asin(sqrt(inner_term))
    ## Round distance to nearest kilometer.
    return round(Int, d)
end

# Our distance matrix is symmetric so we'll convert it to a `LowerTriangular`
# matrix so that we can better interpret the objective value of our model:

dm = LinearAlgebra.LowerTriangular([
    haversine(cities.lat[i], cities.lon[i], cities.lat[j], cities.lon[j])
    for i in 1:n, j in 1:n
])

# ### Build the model

# Now that we have the basics taken  care of, we can set up our model, create
# decision variables, add constraints, and then solve.

# First, we'll set up a model that leverages the Cbc solver. Next, we'll set up
# a binary variable $x_{i,k}$ that takes the value $1$ if city $i$ is in group
# $k$ and $0$ otherwise. Each city must be in a group, so we'll add the
# constraint $\sum_k x_{i,k} = 1$ for every $i$.

model = Model(HiGHS.Optimizer)
set_silent(model)
@variable(model, x[1:n, 1:k], Bin)
@constraint(model, [i = 1:n], sum(x[i, :]) == 1);
## To reduce symmetry, we fix the first city to belong to the first group.
fix(x[1, 1], 1; force = true)

# The total population of a group $k$ is $Q_k = \sum_ix_{i,k}q_i$ where $q_i$ is
# simply the $i$-th value from the `population` column in our `cities` DataFrame.
# Let's add constraints so that $\alpha \leq (Q_k - P) \leq \beta$. We'll set
# $\alpha$ equal to $-3$ million and $\beta$ equal to $3$. By adjusting
# these thresholds you'll find that there is a tradeoff between having
# relatively even populations between groups and having geographically close
# cities within each group. In other words, the larger the absolute values of
# $\alpha$ and $\beta$, the closer together the cities in a group will be but
# the variance between the group populations will be higher.

@variable(model, -3 <= population_diff[1:k] <= 3)
@constraint(model, population_diff .== x' * cities.population .- P)

# Now we need to add one last binary variable $z_{i,j}$ to our model that we'll
# use to compute the total distance between the cities in our groups, defined as
# $\sum_{i,j}d_{i,j}z_{i,j}$. Variable $z_{i,j}$ will equal $1$ if cities $i$
# and $j$ are in the same group, and $0$ if they are not in the same group.

# To ensure that $z_{i,j} = 1$ if and only if cities $i$ and $j$ are in the same
# group, we add the constraints $z_{i,j} \geq x_{i,k} + x_{j,k} - 1$ for every
# pair $i,j$ and every $k$:

@variable(model, z[i=1:n, j=1:i], Bin)
for k in 1:k, i in 1:n, j in 1:i
    @constraint(model, z[i, j] >= x[i, k] + x[j, k] - 1)
end

# We can now add an objective to our model which will simply be to minimize the
# dot product of $z$ and our distance matrix, `dm`.

@objective(model, Min, sum(dm[i, j] * z[i, j] for i in 1:n, j in 1:i));

# We can then call `optimize!` and review the results.

optimize!(model)
assert_is_solved_and_feasible(model)

# ### Reviewing the Results

# Now that we have results, we can add a column to our `cities` DataFrame for
# the group and then loop through our $x$ variable to assign each city to its
# group. Once we have that, we can look at the total population for each group
# and also look at the cities in each group to verify that they are grouped by
# geographic proximity.

cities.group = zeros(n)

for i in 1:n, j in 1:k
    if round(Int, value(x[i, j])) == 1
        cities.group[i] = j
    end
end

for group in DataFrames.groupby(cities, :group)
    @show group
    println("")
    @show sum(group.population)
    println("")
end
